In [1]:
from osgeo import ogr  # plain "import ogr" is deprecated in recent GDAL releases
import json
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from math import radians, cos, sin, asin, sqrt
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
In [2]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # radius of earth in kilometers; use 3956 for miles
    return c * r
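A quick sanity check, not part of the original run: the distance from the Kremlin to Moscow State University (the landmark coordinates reused further down) should come out to roughly 7-8 km.

In [ ]:
# sanity check: Kremlin to Moscow State University, lon/lat order as in the signature
haversine(37.617499, 55.752023, 37.528798, 55.704098)  # ~7.7 km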
In [3]:
train = pd.read_csv("data/train_without_noise.csv", parse_dates=["timestamp"])
test = pd.read_csv("data/test.csv", parse_dates=["timestamp"])
In [4]:
# additional Moscow data: administrative-division shapefile
driver = ogr.GetDriverByName("ESRI Shapefile")
shp = driver.Open("data/administrative-divisions-of-moscow/moscow_adm.shp")
layer = shp.GetLayer()
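It helps to peek at the layer's schema before pulling attributes by name; the field names used below ("RAION", "OKRUGS") come from this shapefile. A minimal inspection sketch using the standard OGR API:

In [ ]:
# list the attribute fields available on the layer (output depends on the shapefile)
layer_defn = layer.GetLayerDefn()
print(layer.GetFeatureCount(), "features")
print([layer_defn.GetFieldDefn(i).GetName() for i in range(layer_defn.GetFieldCount())])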
In [5]:
# the nested coordinate lists have different depths across features
shape_json = json.loads(layer.GetFeature(0).ExportToJson())
len_list = len(np.array(shape_json["geometry"]["coordinates"]).shape)
shape_json1 = json.loads(layer.GetFeature(1).ExportToJson())
len_list1 = len(np.array(shape_json1["geometry"]["coordinates"]).shape)
len_list, len_list1
In [6]:
# sub_area
okrug = []
sub_area = []
coordinates = []
for i in range(layer.GetFeatureCount()):
    sub_area.append(layer.GetFeature(i).GetField("RAION"))
    okrug.append(layer.GetFeature(i).GetField("OKRUGS"))
    shape_json = json.loads(layer.GetFeature(i).ExportToJson())
    if len(np.array(shape_json["geometry"]["coordinates"]).shape) == 2:
        coordinates.append(np.mean(np.array(shape_json["geometry"]["coordinates"][0][0]), axis=0))
    else:
        coordinates.append(np.mean(np.array(shape_json["geometry"]["coordinates"][0]), axis=0))
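Averaging the ring vertices only approximates the district centre (vertex-dense edges pull the mean). If precision mattered, OGR can compute a proper centroid itself; a hedged alternative, not used below:

In [ ]:
# alternative sketch: let OGR compute the polygon centroid directly
centroid = layer.GetFeature(0).GetGeometryRef().Centroid()
centroid.GetX(), centroid.GetY()  # lon, lat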
In [7]:
# timestamp
"""
train["year"] = train.timestamp.apply(lambda x: x.year)
train["month"] = train.timestamp.apply(lambda x: x.month)
train["day"] = train.timestamp.apply(lambda x: x.day)
train["week"] = train.timestamp.apply(lambda x: x.week)
train["week_day"] = train.timestamp.apply(lambda x: x.dayofweek)
"""
# to-do: add a season feature for housing prices (see the sketch below)
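A minimal sketch for that to-do, mapping month to a hypothetical season label. Like the timestamp features above, it is left commented out:

In [ ]:
# hypothetical season feature: 0 = winter, 1 = spring, 2 = summer, 3 = autumn
# train["season"] = train.timestamp.dt.month.map(lambda m: (m % 12) // 3)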
In [8]:
# house
train["diff_max_floor"] = train.max_floor - train.floor
train["perc_max_floor"] = train.floor / train.max_floor
train["diff_full_life_sq"] = train.full_sq - train.life_sq
train["perc_full_life_sq"] = train.life_sq / train.full_sq
train["diff_full_kitch_sq"] = train.full_sq - train.kitch_sq
train["perc_full_kitch_sq"] = train.kitch_sq / train.full_sq
train["diff_full_life_kitch_sq"] = train.full_sq - train.life_sq - train.kitch_sq
train["perc_full_extra_sq"] = (train.life_sq + train.kitch_sq) / train.full_sq
train["sum_life_kitch"] = train.life_sq + train.kitch_sq
train["roomsize"] = (train.life_sq - train.kitch_sq) / train.num_room
train["age_at_sale"] = (train.timestamp.apply(lambda x: x.year) - train.build_year).apply(lambda x: x if x < 1500 else None)
In [9]:
# school
train["perc_preschool"] = train.children_preschool / train.preschool_quota
train["perc_school"] = train.children_school / train.school_quota
In [10]:
# sub-area
kremlin = (37.617499, 55.752023)  # (lon, lat)
moscow_uni = (37.528798, 55.704098)
moscow_df = pd.DataFrame({"sub_area": sub_area, "okrug": okrug, "coordinates": coordinates})
moscow_df["distance_from_kremlin"] = moscow_df.coordinates.apply(lambda x: haversine(x[0], x[1], kremlin[0], kremlin[1]))
moscow_df["distance_from_moscow_uni"] = moscow_df.coordinates.apply(lambda x: haversine(x[0], x[1], moscow_uni[0], moscow_uni[1]))
moscow_df.drop("coordinates", axis=1, inplace=True)
train = pd.merge(train, moscow_df, how="left", on="sub_area")
# to-do maybe add distance to financial district
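The merge keys are the "RAION" names from the shapefile, so it is worth confirming every sub_area matched; this quick check assumes the names line up exactly:

In [ ]:
# rows whose sub_area found no district get NaN distances after the left merge
train.distance_from_kremlin.isnull().sum()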
In [11]:
# set id as the index
train.set_index("id", inplace=True)
In [12]:
# timestamp
"""
test["year"] = test.timestamp.apply(lambda x: x.year)
test["month"] = test.timestamp.apply(lambda x: x.month)
test["day"] = test.timestamp.apply(lambda x: x.day)
test["week"] = test.timestamp.apply(lambda x: x.week)
test["week_day"] = test.timestamp.apply(lambda x: x.dayofweek)
"""
In [13]:
# house
test["diff_max_floor"] = test.max_floor - test.floor
test["perc_max_floor"] = test.floor / test.max_floor
test["diff_full_life_sq"] = test.full_sq - test.life_sq
test["perc_full_life_sq"] = test.life_sq / test.full_sq
test["diff_full_kitch_sq"] = test.full_sq - test.kitch_sq
test["perc_full_kitch_sq"] = test.kitch_sq / test.full_sq
test["diff_full_life_kitch_sq"] = test.full_sq - test.life_sq - test.kitch_sq
test["perc_full_extra_sq"] = (test.life_sq + test.kitch_sq) / test.full_sq
test["sum_life_kitch"] = test.life_sq + test.kitch_sq
test["roomsize"] = (test.life_sq - test.kitch_sq) / test.num_room
test["age_at_sale"] = (test.timestamp.apply(lambda x: x.year) - test.build_year).apply(lambda x: x if x < 1500 else None)
In [14]:
# school
test["perc_preschool"] = test.children_preschool / test.preschool_quota
test["perc_school"] = test.children_school / test.school_quota
In [15]:
# sub-area
kremlin = (37.617499, 55.752023)  # (lon, lat)
moscow_uni = (37.528798, 55.704098)
moscow_df = pd.DataFrame({"sub_area": sub_area, "okrug": okrug, "coordinates": coordinates})
moscow_df["distance_from_kremlin"] = moscow_df.coordinates.apply(lambda x: haversine(x[0], x[1], kremlin[0], kremlin[1]))
moscow_df["distance_from_moscow_uni"] = moscow_df.coordinates.apply(lambda x: haversine(x[0], x[1], moscow_uni[0], moscow_uni[1]))
moscow_df.drop("coordinates", axis=1, inplace=True)
test = pd.merge(test, moscow_df, how="left", on="sub_area")
# to-do maybe add distance to financial district
In [16]:
# set id as the index
test.set_index("id", inplace=True)
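The train and test pipelines above are line-for-line duplicates; a sketch of how they could be folded into one helper (a hypothetical add_features, behavior unchanged):

In [ ]:
# hypothetical refactor: apply the same feature engineering to both frames
def add_features(df):
    df["diff_max_floor"] = df.max_floor - df.floor
    df["perc_full_life_sq"] = df.life_sq / df.full_sq
    # ... remaining house / school / sub-area features exactly as above ...
    return df
# train, test = add_features(train), add_features(test)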
In [17]:
y_train = train["price_doc"]
x_train = train.drop(["timestamp", "price_doc"], axis=1)
In [18]:
# transform non-numerical variables
for c in x_train.columns:
    if x_train[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_train[c].values))
        x_train[c] = lbl.transform(list(x_train[c].values))
In [19]:
# replace missing values with column means (left disabled; xgboost handles NaN natively)
"""
for c in x_train.columns:
    x_train[c].fillna(x_train[c].mean(), inplace=True)
"""
In [20]:
x_test = test.drop(["timestamp"], axis=1)
In [21]:
# transform non-numerical variables
for c in x_test.columns:
    if x_test[c].dtype == "object":
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_test[c].values))
        x_test[c] = lbl.transform(list(x_test[c].values))
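One caveat worth flagging: fitting a separate LabelEncoder on train and test can assign different integers to the same category. A hedged fix would fit each encoder on the union of both columns, e.g.:

In [ ]:
# sketch: shared encoding so train and test agree on category codes
# for c in x_train.columns:
#     if x_train[c].dtype == "object":
#         lbl = preprocessing.LabelEncoder()
#         lbl.fit(list(x_train[c].values) + list(x_test[c].values))
#         x_train[c] = lbl.transform(list(x_train[c].values))
#         x_test[c] = lbl.transform(list(x_test[c].values))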
In [22]:
# replace missing values with column means (left disabled; xgboost handles NaN natively)
"""
for c in x_test.columns:
    x_test[c].fillna(x_test[c].mean(), inplace=True)
"""
In [23]:
xgb_params = {
    "eta": 0.05,
    "max_depth": 5,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "objective": "reg:linear",
    "eval_metric": "rmse",
    "silent": 1,
    "seed": 42
}
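These parameter names match the XGBoost releases of the time; on current versions the same configuration is spelled slightly differently (a renaming, not a behavior change):

In [ ]:
# equivalent spellings for newer xgboost releases (sketch)
# "objective": "reg:squarederror",  # "reg:linear" was renamed
# "verbosity": 0,                   # replaces the deprecated "silent"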
In [24]:
dtrain = xgb.DMatrix(x_train, y_train)
In [25]:
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
                   verbose_eval=50, show_stdv=False)
In [26]:
cv_output[["train-rmse-mean", "test-rmse-mean"]].plot()
pass
In [27]:
num_boost_rounds = len(cv_output)  # cv output is truncated to the best iteration by early stopping
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
In [28]:
xgb.plot_importance(model, max_num_features=20)
pass
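If the raw importance numbers are wanted rather than the plot, the booster exposes them directly; a small sketch using the standard API:

In [ ]:
# same importances as data: get_fscore() returns {feature: split count}
pd.Series(model.get_fscore()).sort_values(ascending=False).head(20)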
In [29]:
pred = model.predict(dtrain)
In [30]:
# mean absolute error on the training set (np.mean already divides by n)
error = np.mean(np.abs(pred - y_train))
In [31]:
n = len(y_train)
rmsle = np.sqrt(np.sum(np.power(np.log(pred + 1) - np.log(y_train.values + 1), 2)) / n)
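As a cross-check, the same training-set RMSLE can be computed with scikit-learn; the clip is needed because mean_squared_log_error rejects negative predictions:

In [ ]:
# sanity check against the manual formula
from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error(y_train, np.clip(pred, 0, None)))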
In [32]:
print("RMSLE: {rmsle}, Error: {error}".format(rmsle=rmsle, error=error))
In [33]:
dtest = xgb.DMatrix(x_test)
In [34]:
y_predict = model.predict(dtest)
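A linear-objective booster can emit negative prices on odd rows; a defensive floor at zero before building the submission is a common guard (left commented out, since it changes the output):

In [ ]:
# optional safeguard: prices cannot be negative
# y_predict = np.clip(y_predict, 0, None)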
In [35]:
output = pd.DataFrame({"id": x_test.index, "price_doc": y_predict})
In [36]:
output.to_csv("submissions_add_features.csv", index=False)